Just checking my ability to load the AHP data and see overall structure with a PCA

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.2     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.2     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(metamoRph)

proteome_df <- data.table::fread('../data/AhpDb_PsmMatrix.csv')
## Warning in data.table::fread("../data/AhpDb_PsmMatrix.csv"): Discarded
## single-line footer: <<(18869 rows affected)>>
proteome_df <- proteome_df[-1,] # remove visual formatting line
# make numeric
proteome_matrix <- proteome_df[,-1] %>% as.matrix()
mode(proteome_matrix) = "numeric"
## Warning in mde(x): NAs introduced by coercion
row.names(proteome_matrix) <- proteome_df$Accession
proteome_meta <- read_delim('../data/AhpDb_ProteinSummary.csv', delim = 'asdf') 
## Rows: 1649 Columns: 1
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "asdf"
## chr (1): UniProt_Id                                                         ...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
colnames(proteome_meta) <- 'one'

# some wacky stuff to handle the ... interesting ... data format
proteome_meta <- proteome_meta %>% 
  mutate_if(is.character, utf8::utf8_encode) %>% 
  separate(one, into = 
             c("UniProt_Id", "Gene_Symbol", "Protein_Name", "Gene_Names", "Accession", "PsmSum", "MeanPsm", "PsmPresent", "PctPresent"), 
           sep = '\\s\\s+,|,\\s\\s+')
## Warning: Expected 9 pieces. Missing pieces filled with `NA` in 2 rows [1,
## 1649].
# first row is visual formatting
# last row is empty
proteome_meta <- proteome_meta[-c(1,1649),]

sample_meta <- read_csv('../data/AhpDb_ClinicalData.csv')
## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
##   dat <- vroom(...)
##   problems(dat)
## Rows: 313 Columns: 62
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (62): SampleId, CD, Sex, Age, Race, Ethnicity, Iop, IopMethod, OdOs, Inc...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
sample_meta <- sample_meta[-c(1,314),]

PSM Sum Distributions

Should remove the handful of samples with a PSM sum less than 2000 and sum normalize (a la TPM)

proteome_matrix %>% as_tibble() %>% select(contains("AH")) %>% colSums(na.rm = TRUE) %>% density() %>% plot()

proteome_matrix %>% as_tibble() %>% select(contains("AH")) %>% colSums(na.rm = TRUE) %>% summary()
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     687   10093   11469   11783   13115   23364

Remove low sum PSM samples

remove_id <-  proteome_matrix %>% as_tibble() %>% select(contains("AH")) %>% colSums(na.rm = TRUE) %>% enframe() %>% filter(value < 2000)
proteome_tib <- proteome_matrix %>% as_tibble(rownames = 'Accession') %>% select(-contains(remove_id$name))

PCA

Eight samples are REALLY different than the rest

proteome_mat_clean <- proteome_tib %>% select(contains("AH")) %>% as.matrix()
row.names(proteome_mat_clean) <- proteome_tib$Accession
aligned_sample_meta <- colnames(proteome_mat_clean) %>% enframe(value = 'SampleId') %>% select(-name) %>% 
  left_join(sample_meta)
## Joining with `by = join_by(SampleId)`
proteome_mat_clean[is.na(proteome_mat_clean)] <- 0
pca_mm <- run_pca(proteome_mat_clean,aligned_sample_meta)
## Sample CPM scaling
## log1p scaling
## prcomp start
plot <- pca_mm$PCA$x %>% as_tibble(rownames = 'SampleId') %>% 
  left_join(pca_mm$meta, by = 'SampleId') %>% 
  ggplot(aes(x=PC1,y=PC2,color = CD, label = SampleId)) + 
  geom_point() +
  scale_color_manual(values = pals::alphabet() %>% unname()) +
  cowplot::theme_cowplot()
plotly::ggplotly(plot)

PCA Again

Removing those eight samples and AH602 (top left) and redoing the PCA

Looks reasonable now

proteome_mat_clean_filter <- proteome_mat_clean[,pca_mm$PCA$x[,c('PC1','PC2')] %>% as_tibble(rownames = 'sample') %>%  filter(PC1 < 20, PC2 < 20) %>% pull(sample)]

aligned_sample_meta <- colnames(proteome_mat_clean_filter) %>% enframe(value = 'SampleId') %>% select(-name) %>% 
  left_join(sample_meta)
## Joining with `by = join_by(SampleId)`
pca_mm <- run_pca(proteome_mat_clean_filter,aligned_sample_meta)
## Sample CPM scaling
## log1p scaling
## prcomp start
plot <- pca_mm$PCA$x %>% as_tibble(rownames = 'SampleId') %>% 
  left_join(pca_mm$meta, by = 'SampleId') %>% 
  ggplot(aes(x=PC1,y=PC2,color = CD, label = SampleId)) + 
  geom_point() +
  scale_color_manual(values = pals::alphabet() %>% unname()) +
  cowplot::theme_cowplot()
plotly::ggplotly(plot)

Output cleaned data

save(proteome_mat_clean_filter, aligned_sample_meta, file = '../data/ahpdb_cleaned_data.Rdata')

Session Info

sessionInfo()
## R version 4.3.0 (2023-04-21)
## Platform: aarch64-apple-darwin20 (64-bit)
## Running under: macOS Ventura 13.4.1
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/lib/libRblas.0.dylib 
## LAPACK: /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/lib/libRlapack.dylib;  LAPACK version 3.11.0
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## time zone: America/New_York
## tzcode source: internal
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] metamoRph_0.2.0 lubridate_1.9.2 forcats_1.0.0   stringr_1.5.0  
##  [5] dplyr_1.1.2     purrr_1.0.1     readr_2.1.4     tidyr_1.3.0    
##  [9] tibble_3.2.1    ggplot2_3.4.2   tidyverse_2.0.0
## 
## loaded via a namespace (and not attached):
##   [1] bitops_1.0-7                rlang_1.1.1                
##   [3] magrittr_2.0.3              matrixStats_1.0.0          
##   [5] compiler_4.3.0              DelayedMatrixStats_1.21.0  
##   [7] vctrs_0.6.3                 maps_3.4.1                 
##   [9] pkgconfig_2.0.3             crayon_1.5.2               
##  [11] fastmap_1.1.1               ellipsis_0.3.2             
##  [13] XVector_0.40.0              labeling_0.4.2             
##  [15] scuttle_1.9.4               utf8_1.2.3                 
##  [17] rmarkdown_2.23              tzdb_0.4.0                 
##  [19] bit_4.0.5                   xfun_0.40                  
##  [21] bluster_1.9.1               zlibbioc_1.46.0            
##  [23] cachem_1.0.8                pals_1.7                   
##  [25] beachmat_2.15.0             GenomeInfoDb_1.36.1        
##  [27] jsonlite_1.8.7              highr_0.10                 
##  [29] DelayedArray_0.26.7         BiocParallel_1.33.11       
##  [31] irlba_2.3.5.1               parallel_4.3.0             
##  [33] cluster_2.1.4               R6_2.5.1                   
##  [35] bslib_0.5.0                 stringi_1.7.12             
##  [37] limma_3.56.1                GenomicRanges_1.52.0       
##  [39] jquerylib_0.1.4             Rcpp_1.0.11                
##  [41] SummarizedExperiment_1.30.2 knitr_1.43                 
##  [43] IRanges_2.34.1              Matrix_1.5-4.1             
##  [45] igraph_1.4.3                timechange_0.2.0           
##  [47] tidyselect_1.2.0            rstudioapi_0.14            
##  [49] dichromat_2.0-0.1           abind_1.4-5                
##  [51] yaml_2.3.7                  codetools_0.2-19           
##  [53] lattice_0.21-8              Biobase_2.60.0             
##  [55] withr_2.5.0                 evaluate_0.21              
##  [57] pillar_1.9.0                MatrixGenerics_1.12.3      
##  [59] stats4_4.3.0                plotly_4.10.1              
##  [61] generics_0.1.3              vroom_1.6.3                
##  [63] RCurl_1.98-1.12             S4Vectors_0.38.1           
##  [65] hms_1.1.3                   sparseMatrixStats_1.11.1   
##  [67] munsell_0.5.0               scales_1.2.1               
##  [69] glue_1.6.2                  metapod_1.7.0              
##  [71] mapproj_1.2.11              lazyeval_0.2.2             
##  [73] tools_4.3.0                 BiocNeighbors_1.17.1       
##  [75] data.table_1.14.8           ScaledMatrix_1.8.1         
##  [77] locfit_1.5-9.7              scran_1.27.1               
##  [79] cowplot_1.1.1               grid_4.3.0                 
##  [81] crosstalk_1.2.0             edgeR_3.42.2               
##  [83] colorspace_2.1-0            SingleCellExperiment_1.22.0
##  [85] GenomeInfoDbData_1.2.10     BiocSingular_1.15.0        
##  [87] cli_3.6.1                   rsvd_1.0.5                 
##  [89] fansi_1.0.4                 viridisLite_0.4.2          
##  [91] S4Arrays_1.0.5              gtable_0.3.3               
##  [93] sass_0.4.7                  digest_0.6.33              
##  [95] BiocGenerics_0.46.0         dqrng_0.3.0                
##  [97] htmlwidgets_1.6.2           htmltools_0.5.5            
##  [99] lifecycle_1.0.3             httr_1.4.6                 
## [101] statmod_1.5.0               bit64_4.0.5